Section 5: snRNAseq analysis
snRNAseq analysis
This section uses snRNAseq BAP.d8 dataset (Khan et. al.,) and nTE/nCT dataset (Io et. al.,) to run scRNAseq analyses using Seurat. Breifly, the counts data is imported in R as Seurat object, samples are intergrated, transformed and clustering analyses is performed. Expression of marker genes in each cluster, composition of cell types and PlacentaCellEnrich are then plotted.
Prerequisites
R packages required for this section are loaded
setwd("~/github/amnion.vs.other_RNASeq")
library(Seurat)
library(SeuratWrappers)
library(knitr)
library(kableExtra)
library(ggplot2)
library(cowplot)
library(patchwork)
library(metap)
library(multtest)
library(gridExtra)
library(dplyr)
library(stringr)
library(TissueEnrich)
library(gprofiler2)
library(tidyverse)
library(enhancedDimPlot)
library(calibrate)
library(ggrepel)
library(dittoSeq)
library(ComplexHeatmap)
library(RColorBrewer)
library(pheatmap)
library(scales)
library(plotly)
library(R.utils)Import datasets
The 10X data was already processed with CellRanger (v.4.0.0) and the counts table was ready for us to import for the data analyses. We used the inbuilt function to import this data and to create a Seurat object as described below.
experiment_name = "BAP"
dataset_loc <- "~/TutejaLab/expression-data"
ids <-
c(
"5pcO2_r1",
"5pcO2_r2",
"20pcO2_r1",
"20pcO2_r2",
"nCT_D5",
"nCT_D10",
"nTE_D2",
"nTE_D3"
)
d10x.data <- sapply(ids, function(i) {
d10x <-
Read10X(file.path(dataset_loc, i, "filtered_feature_bc_matrix"))
colnames(d10x) <-
paste(sapply(strsplit(colnames(d10x), split = "-"), '[[' , 1L),
i,
sep ="-")
d10x
})
experiment.data <- do.call("cbind", d10x.data)
bapd8.combined <- CreateSeuratObject(
experiment.data,
project = "BAPd8",
min.cells = 10,
min.genes = 200,
names.field = 2,
names.delim = "\\-"
)Data quality insepction
After the data was imported, we checked the quality of the data. Mitochondrial expression is an important criteria (along with other quantitative features of each nuclei) to decide if the nucleus is good or bad. We tested it as follows
MT ratio in nucleus
bapd8.temp <- bapd8.combined
bapd8.combined$log10GenesPerUMI <-
log10(bapd8.combined$nFeature_RNA) / log10(bapd8.combined$nCount_RNA)
bapd8.combined$mitoRatio <-
PercentageFeatureSet(object = bapd8.combined, pattern = "^MT-")
bapd8.combined$mitoRatio <- bapd8.combined@meta.data$mitoRatio / 100
metadata <- bapd8.combined@meta.data
metadata$cells <- rownames(metadata)
metadata <- metadata %>%
dplyr::rename(
seq_folder = orig.ident,
nUMI = nCount_RNA,
nGene = nFeature_RNA,
seq_folder = orig.ident
)mt <-
ggplot(dat = metadata, aes(x = nUMI, y = nGene, color = mitoRatio)) +
geom_point(alpha = 0.5) +
scale_colour_gradient(low = "gray90", high = "black") + labs(colour =
"MT ratio") +
theme_bw(base_size = 12) +
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
strip.background = element_blank(),
panel.border = element_rect(colour = "black")
) +
xlab("RNA counts") + ylab("Gene counts") +
stat_smooth(method = lm) +
facet_wrap( ~ seq_folder, labeller = labeller(
seq_folder =
c(
"20pcO2_r1" = "20% Oxygen (rep1)",
"20pcO2_r2" = "20% Oxygen (rep2)",
"5pcO2_r1" = "5% Oxygen (rep1)",
"5pcO2_r2" = "5% Oxygen (rep2)",
"nCT_D5" = "nCT day 5",
"nCT_D10" = "nCT day 10",
"nTE_D2" = "nTE day 2",
"nTE_D3" = "nTE day 3"
)
)) +
scale_y_continuous(label = comma) +
scale_x_continuous(label = comma)
mtFig 5.1: MT ratio across samples. Cells with higher MT ratio also have less gene counts
Number of nuclei per sample
ncells <- ggplot(metadata, aes(x = seq_folder, fill = seq_folder)) +
geom_bar() +
theme_classic() +
theme(axis.text.x = element_text(
angle = 45,
vjust = 1,
hjust = 1
)) +
theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
ggtitle("Number of Cells")
ncellsFig 5.2: Number of cells in each sample
Density of nuclei per sample
dcells <-
ggplot(metadata, aes(color = seq_folder, x = nUMI, fill = seq_folder)) +
geom_density(alpha = 0.2) +
scale_x_log10() +
theme_classic() +
ylab("Cell density") +
geom_vline(xintercept = 500)
dcellsFig 5.3: Density of cells across samples transcript counts
Number of cells vs. genes
ngenes <-
ggplot(metadata, aes(x = seq_folder, y = log10(nGene), fill = seq_folder)) +
geom_boxplot() +
theme_classic() +
theme(axis.text.x = element_text(
angle = 45,
vjust = 1,
hjust = 1
)) +
theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
ggtitle("NCells vs NGenes")
ngenesFig 5.4: Number of cells vs. total gene coutns
Number of cells vs. transcripts
dtranscripts <-
ggplot(metadata,
aes(x = log10GenesPerUMI, color = seq_folder, fill = seq_folder)) +
geom_density(alpha = 0.2) +
theme_classic() +
geom_vline(xintercept = 0.8)
dtranscriptsFig 5.5: Number of cells vs. transcripts
Mitochondrial density across samples
mtratio <-
ggplot(metadata, aes(color = seq_folder, x = mitoRatio, fill = seq_folder)) +
geom_density(alpha = 0.2) +
scale_x_log10() +
theme_classic() +
geom_vline(xintercept = 0.2)
mtratio
#> Warning: Transformation introduced infinite values in continuous x-axis
#> Warning: Removed 21 rows containing non-finite values (stat_density).Fig 5.6: Mitochondrial density across samples
Data filtering
After inspection, we decided to remove all mitochondiral genes as well as ribosomal genes from our analyses.
set up the metadata file and organize
bapd8.combined <- bapd8.temp
df <- bapd8.combined@meta.data
df$replicate <- NA
df$replicate[which(str_detect(df$orig.ident, "5pcO2"))] <- "5pcO2"
df$replicate[which(str_detect(df$orig.ident, "20pcO2"))] <- "20pcO2"
df$replicate[which(str_detect(df$orig.ident, "nCT_"))] <- "nCT"
df$replicate[which(str_detect(df$orig.ident, "nTE_"))] <- "nTE"
bapd8.combined@meta.data <- df
bapd8.combined[["percent.mt"]] <-
PercentageFeatureSet(bapd8.combined, pattern = "^MT-")Comparing samples QC across Replicates
p <-
VlnPlot(bapd8.combined, features = "nFeature_RNA", pt.size = 1) +
geom_hline(yintercept = 200,
color = "red",
size = 1) +
geom_hline(yintercept = 10000,
color = "red",
size = 1) +
theme(legend.position = "none")
q <- VlnPlot(bapd8.combined, features = "nCount_RNA", pt.size = 1) +
theme(legend.position = "none")
r <- VlnPlot(bapd8.combined, features = "percent.mt", pt.size = 1) +
geom_hline(yintercept = 15,
color = "red",
size = 1) +
theme(legend.position = "none")
panel_plot <-
plot_grid(p,
q,
r,
labels = c("A", "B", "C"),
ncol = 3,
nrow = 1)
panel_plotFig 5.7: Comparing gene counts, transcript counts and MT percent across samples
Filtering
B1 <-
FeatureScatter(bapd8.combined,
feature1 = "nCount_RNA",
feature2 = "percent.mt")
B2 <-
FeatureScatter(bapd8.combined,
feature1 = "nCount_RNA",
feature2 = "nFeature_RNA")
bapd8.combined <-
subset(bapd8.combined,
subset = nFeature_RNA > 200 &
nFeature_RNA < 10000 & percent.mt < 25)
I1 <-
FeatureScatter(bapd8.combined,
feature1 = "nCount_RNA",
feature2 = "percent.mt")
I2 <-
FeatureScatter(bapd8.combined,
feature1 = "nCount_RNA",
feature2 = "nFeature_RNA")
bapd8.combined <-
subset(bapd8.combined,
subset = nFeature_RNA > 200 &
nFeature_RNA < 10000 & percent.mt < 15)
A1 <-
FeatureScatter(bapd8.combined,
feature1 = "nCount_RNA",
feature2 = "percent.mt")
A2 <-
FeatureScatter(bapd8.combined,
feature1 = "nCount_RNA",
feature2 = "nFeature_RNA")
B <- B1 | B2
I <- I1 | I2
A <- A1 | A2
panel_plot <-
plot_grid(
B1,
B2,
I1,
I2,
A1,
A2,
labels = c("A", "B", "C", "D", "E", "F"),
ncol = 2,
nrow = 3
)
panel_plotFig 5.8: Scatter plot showing distribution of cells percent MT vs. mRNA counts, and gene counts vs. mRNA counts. A, B before filtering, C, D preliminary filtering, and E, F final cell content in samples used for analyses.
Removing ribosomal and MT proteins
counts <- GetAssayData(object = bapd8.combined, slot = "counts")
counts <-
counts[grep(pattern = "^MT",
x = rownames(counts),
invert = TRUE), ]
counts <-
counts[grep(pattern = "^MT",
x = rownames(counts),
invert = TRUE), ]
counts <-
counts[grep(pattern = "^RPL",
x = rownames(counts),
invert = TRUE), ]
counts <-
counts[grep(pattern = "^RPS",
x = rownames(counts),
invert = TRUE), ]
counts <-
counts[grep(pattern = "^MRPS",
x = rownames(counts),
invert = TRUE), ]
counts <-
counts[grep(pattern = "^MRPL",
x = rownames(counts),
invert = TRUE), ]
keep_genes <- Matrix::rowSums(counts) >= 10
filtered_counts <- counts[keep_genes,]
bapd8.fcombined <-
CreateSeuratObject(filtered_counts, meta.data = bapd8.combined@meta.data)
bapd8.fcombined@meta.data <- bapd8.fcombined@meta.data[1:4]
bapd8.combined <- bapd8.fcombinedData integration and Clustering
Seurat package was used for integrating samples and running the snRNAseq analyses.
data integration/clustering
(see optimization section below)
bapd8.list <- SplitObject(bapd8.combined, split.by = "orig.ident")
bapd8.list <- lapply(
X = bapd8.list,
FUN = function(x) {
x <- NormalizeData(x)
x <-
FindVariableFeatures(x, selection.method = "vst", nfeatures = 2000)
}
)
bapd8.anchors <-
FindIntegrationAnchors(object.list = bapd8.list, dims = 1:20)
bapd8.integrated <-
IntegrateData(anchorset = bapd8.anchors, dims = 1:20)
DefaultAssay(bapd8.integrated) <- "integrated"
bapd8.integrated <- ScaleData(bapd8.integrated, verbose = FALSE)
bapd8.integrated <-
RunPCA(bapd8.integrated, npcs = 30, verbose = FALSE)
bapd8.integrated <-
RunUMAP(bapd8.integrated, reduction = "pca", dims = 1:20)
#> Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
#> To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
#> This message will be shown once per session
bapd8.integrated <-
FindNeighbors(bapd8.integrated, reduction = "pca", dims = 1:20)
bapd8.integrated <- FindClusters(bapd8.integrated, resolution = 0.5)
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#>
#> Number of nodes: 17208
#> Number of edges: 610547
#>
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.8716
#> Number of communities: 13
#> Elapsed time: 1 seconds
num.clusters <- nlevels(bapd8.integrated$seurat_clusters)
num.clusters
#> [1] 13Optimization: Cells/Genes defining PCA (4 PCs)
VizDimLoadings(bapd8.integrated, dims = 1:4, reduction = "pca")Fig 5.9: Genes that define first 4 principal components
Optimization: Determine data dimensionality
bapd8.integrated <- JackStraw(bapd8.integrated, num.replicate = 100)
bapd8.integrated <- ScoreJackStraw(bapd8.integrated, dims = 1:20)
elbow <- ElbowPlot(bapd8.integrated)
jack <- JackStrawPlot(bapd8.integrated, dims = 1:20)
panel_plot <- plot_grid(elbow, jack, labels = c('A', 'B'), ncol = 2)
#> Warning: Removed 28000 rows containing missing values (geom_point).
panel_plotFig 5.10: Data Dimensionality. (A) Elbow plot showing the rankings of PC (first 20) in the PCA (B) JackStraw plot showing the distribution of p-values for each PC (first 20 shown) in the PCA
Renumber the clusters
By default the clusters are numbers 0-12, we need them as 1-13.
df <- bapd8.integrated@meta.data
df$new_clusters <- as.factor(as.numeric(df$seurat_clusters))
bapd8.integrated@meta.data <- df
Idents(bapd8.integrated) <- "new_clusters"Visulaize dimensional reduction
A = enhancedDimPlot(
object = bapd8.integrated,
grouping_var = 'ident',
reduction = "umap",
label = TRUE,
pt.size = 1,
alpha = 0.5
) +
ggtitle("A") +
xlab("UMAP_1") +
ylab("UMAP_2") +
theme_classic() +
theme(legend.position = "none",
plot.title = element_text(face = "bold"))
B <- enhancedDimPlot(
object = bapd8.integrated,
grouping_var = 'replicate',
reduction = "umap",
label = FALSE,
pt.size = 1,
alpha = 0.4
) +
ggtitle("B") +
xlab("UMAP_1") +
ylab("UMAP_2") +
theme_classic() +
theme(
legend.justification = c(1, 1),
legend.position = c(1, 1),
plot.title = element_text(face = "bold")
) +
scale_colour_manual(
name = "Conditions",
labels = c(expression(paste('20% ', 'O'[2])),
expression(paste('5% ', 'O'[2])),
'nCT',
'nTE'),
values = c(
"20pcO2" = "#DA3C96",
"5pcO2" = "#A90065",
"nCT" = "#FFD74D",
"nTE" = "#9BC13C"
)
) +
scale_fill_manual(
name = "Conditions",
labels = c(expression(paste('20% ', 'O'[2])),
expression(paste('5% ', 'O'[2])),
'nCT',
'nTE'),
values = c(
"20pcO2" = "#DA3C96",
"5pcO2" = "#A90065",
"nCT" = "#FFD74D",
"nTE" = "#9BC13C"
)
) +
scale_linetype_manual(values = "blank")
C <- enhancedDimPlot(
object = bapd8.integrated,
grouping_var = 'orig.ident',
reduction = "umap",
label = FALSE,
pt.size = 1,
alpha = 0.4
) +
ggtitle("C") +
xlab("UMAP_1") +
ylab("UMAP_2") +
theme_classic() +
theme(
legend.justification = c(1, 1),
legend.position = c(1, 1),
plot.title = element_text(face = "bold")
) +
scale_colour_manual(
name = "Replicates",
labels = c(
expression(paste('20% ', 'O'[2], ' rep1')),
expression(paste('20% ', 'O'[2], ' rep2')),
expression(paste('5% ', 'O'[2], ' rep1')),
expression(paste('5% ', 'O'[2], ' rep1')),
"nCT day 5",
"nCT day 10",
"nTE day 3",
"nTE day 5"
),
values = c(
"20pcO2_r1" = "#0571b0",
"20pcO2_r2" = "#92c5de",
"5pcO2_r1" = "#ca0020",
"5pcO2_r2" = "#f4a582",
"nCT_D5" = "#d133ff",
"nCT_D10" = "#ff33f6",
"nTE_D2" = "#33ffa2",
"nTE_D3" = "#5bff33"
)
) +
scale_fill_manual(
name = "Replicates",
labels = c(
expression(paste('20% ', 'O'[2], ' rep1')),
expression(paste('20% ', 'O'[2], ' rep2')),
expression(paste('5% ', 'O'[2], ' rep1')),
expression(paste('5% ', 'O'[2], ' rep1')),
"nCT day 5",
"nCT day 10",
"nTE day 3",
"nTE day 5"
),
values = c(
"20pcO2_r1" = "#0571b0",
"20pcO2_r2" = "#92c5de",
"5pcO2_r1" = "#ca0020",
"5pcO2_r2" = "#f4a582",
"nCT_D5" = "#d133ff",
"nCT_D10" = "#ff33f6",
"nTE_D2" = "#33ffa2",
"nTE_D3" = "#5bff33"
)
) +
scale_linetype_manual(values = "blank")
panel_plot <- plot_grid(A, B, ncol = 2, nrow = 1)
panel_plotFig 5.11: Dimensional reduction plot showing cells plotted in two dimensions. (A) colored based on cluster identitiy (B) colored based on sample identity
DimPlot(object = bapd8.integrated,
split.by = 'orig.ident',
ncol = 4) +
xlab("UMAP_1") +
ylab("UMAP_2") +
theme_classic() +
theme(legend.position = "none")Fig 5.12: Dimensional reduction plot showing distribution of cells in the cluster across samples
Interactive dimensional reduction plot
A = enhancedDimPlot(
object = bapd8.integrated,
grouping_var = 'ident',
reduction = "umap",
label = FALSE,
pt.size = 1,
alpha = 0.5
) +
xlab("UMAP_1") +
ylab("UMAP_2") +
theme_classic() +
theme(legend.position = "none")
ggplotly(A)Fig 5.13: Interactive Dimensional reduction plot cells plotted in two dimensions. The colored dots represent individual cells and are assigned based on cluster identity
Finding cluster markers
Cluster markers are defined as FC of >= 1 and p-value (adj) <= 0.05. We will find all markers of for each cluster with a loop.
DefaultAssay(bapd8.integrated) <- "RNA"
for (i in 1:num.clusters) {
try({
cluster.markers.all <- FindMarkers(bapd8.integrated, ident.1 = i)
cluster.markers.filtered <-
cluster.markers.all %>%
filter(avg_log2FC >= 0.584962501) %>%
filter(p_val_adj <= 0.05) %>%
arrange(desc(avg_log2FC))
markers.filtered.names <- rownames(cluster.markers.filtered)
assign(paste("cluster.marker.names", i, sep = "."),
markers.filtered.names)
})
}Cluster cell type composition
fullCounts <- tibble(
cluster = bapd8.integrated@meta.data$new_clusters,
cell_type = bapd8.integrated@meta.data$orig.ident
) %>%
dplyr::group_by(cluster, cell_type) %>%
dplyr::count() %>%
dplyr::group_by(cluster) %>%
dplyr::ungroup() %>%
dplyr::mutate(cluster = paste0("Cluster_", cluster))
fullCounts <- fullCounts %>%
group_by(cell_type) %>%
mutate(cell_type_sum = sum(n)) %>%
mutate(percent = (n * 100) / cell_type_sum)
list2env(split(fullCounts, fullCounts$cluster), envir = .GlobalEnv)
#> <environment: R_GlobalEnv>Bar plot function
mybarplot <- function(pdata, i) {
ggplot(data = pdata,
aes(
x = cell_type,
y = percent,
fill = cell_type,
alpha = 0.5
)) +
geom_bar(stat = "identity") +
theme_classic(base_size = 12) +
theme(legend.position = "none",
axis.text.x = element_text(
angle = 45,
vjust = 1,
hjust = 1
)) +
ggtitle(paste("Cluster", i, "cell composition")) +
ylab("% cells in cluster") +
xlab("") +
scale_colour_manual(
name = "Replicates",
labels = c(
expression(paste('20% ', 'O'[2], ' rep1')),
expression(paste('20% ', 'O'[2], ' rep2')),
expression(paste('5% ', 'O'[2], ' rep1')),
expression(paste('5% ', 'O'[2], ' rep1')),
"nCT day 5",
"nCT day 10",
"nTE day 3",
"nTE day 5"
),
values = c(
"20pcO2_r1" = "#DA3C96",
"20pcO2_r2" = "#DA3C96",
"5pcO2_r1" = "#A90065",
"5pcO2_r2" = "#A90065",
"nCT_D5" = "#FFD74D",
"nCT_D10" = "#FFD74D",
"nTE_D2" = "#9BC13C",
"nTE_D3" = "#9BC13C"
)
) +
scale_fill_manual(
name = "Replicates",
labels = c(
expression(paste('20% ', 'O'[2], ' rep1')),
expression(paste('20% ', 'O'[2], ' rep2')),
expression(paste('5% ', 'O'[2], ' rep1')),
expression(paste('5% ', 'O'[2], ' rep1')),
"nCT day 5",
"nCT day 10",
"nTE day 3",
"nTE day 5"
),
values = c(
"20pcO2_r1" = "#DA3C96",
"20pcO2_r2" = "#DA3C96",
"5pcO2_r1" = "#A90065",
"5pcO2_r2" = "#A90065",
"nCT_D5" = "#FFD74D",
"nCT_D10" = "#FFD74D",
"nTE_D2" = "#9BC13C",
"nTE_D3" = "#9BC13C"
)
) +
scale_linetype_manual(values = "blank")
}Define Colors
define colors for each cluster so that they are standardized.
ggplotColours <- function(n = 6, h = c(0, 360) + 15) {
if ((diff(h) %% 360) < 1)
h[2] <- h[2] - 360 / n
hcl(h = (seq(h[1], h[2], length = n)), c = 100, l = 65)
}
color_list <- ggplotColours(n = 13)Violin plot function
grouped_violinPlots <-
function(markersfile,
clusternumber,
seuratobject = bapd8.integrated) {
dittoPlotVarsAcrossGroups(
seuratobject,
markersfile,
group.by = "new_clusters",
main = paste("Cluster ", clusternumber, " markers expression"),
xlab = "",
ylab = "Mean z-score expression",
x.labels = c(
"Cluster 1",
"Cluster 2",
"Cluster 3",
"Cluster 4",
"Cluster 5",
"Cluster 6",
"Cluster 7",
"Cluster 8",
"Cluster 9",
"Cluster 10",
"Cluster 11",
"Cluster 12",
"Cluster 13"
),
vlnplot.lineweight = 0.5,
legend.show = FALSE,
jitter.size = 0.5,
color.panel = color_list
)
}PCE plot function
Load the PCE data
l <-
load(file = "~/TutejaLab/PlacentaEnrich/combine-test-expression1.Rdata")
humanGeneMapping <- dataset$GRCH38$humanGeneMapping
d <- dataset$PlacentaDeciduaBloodData
data <- d$expressionData
cellDetails <- d$cellDetailsFunction for running/plotting the PCE
runpce <- function(inputgenelist, clusternumber, shade) {
inputGenes <- toupper(inputgenelist)
humanGene <-
humanGeneMapping[humanGeneMapping$Gene.name %in% inputGenes, ]
inputGenes <- humanGene$Gene
expressionData <-
data[intersect(row.names(data), humanGeneMapping$Gene), ]
se <-
SummarizedExperiment(
assays = SimpleList(as.matrix(expressionData)),
rowData = row.names(expressionData),
colData = colnames(expressionData)
)
cellSpecificGenesExp <-
teGeneRetrieval(se, expressedGeneThreshold = 1)
print(length(inputGenes))
gs <- GeneSet(geneIds = toupper(inputGenes))
output2 <- teEnrichmentCustom(gs, cellSpecificGenesExp)
enrichmentOutput <-
setNames(data.frame(assay(output2[[1]]), row.names = rowData(output2[[1]])[, 1]),
colData(output2[[1]])[, 1])
row.names(cellDetails) <- cellDetails$RName
enrichmentOutput$Tissue <-
cellDetails[row.names(enrichmentOutput), "CellName"]
ggplot(data = enrichmentOutput,
mapping = aes(x = reorder (Tissue,-Log10PValue), Log10PValue)) +
geom_bar(stat = "identity",
color = shade,
fill = shade) + theme_classic(base_size = 10) +
theme(
axis.text.x = element_text(
angle = 45,
vjust = 1,
hjust = 1,
size = 10
),
plot.margin = unit(c(1, 1, 1, 3), "cm")
) +
ggtitle(paste0("Cluster ", clusternumber, " PCE")) +
ylab("-log10 p-value (adj.)") +
xlab("") +
scale_y_continuous(expand = expansion(mult = c(0, .1)))
}Run analyses on each cluster
Cluser 1
pce <- runpce(cluster.marker.names.1, 1, color_list[1])
#> [1] 68
count <- mybarplot(Cluster_1, 1)
violin <- grouped_violinPlots(cluster.marker.names.1, 1)
toprow <-
plot_grid(count, violin, labels = c('A', 'B'), align = 'h')
panel_plot <-
plot_grid(
toprow,
pce,
labels = c('', 'C'),
ncol = 1,
rel_heights = c(1, 1.3)
)
panel_plotFig 5.14: Cluster 1 (A) percentage cells of various samples for this cluster, (B) expression of the marker genes across clusters, (C) cell-enrichment results
Cluser 2
pce <- runpce(cluster.marker.names.2, 2, color_list[2])
#> [1] 44
count <- mybarplot(Cluster_2, 2)
violin <- grouped_violinPlots(cluster.marker.names.2, 2)
toprow <-
plot_grid(count, violin, labels = c('A', 'B'), align = 'h')
panel_plot <-
plot_grid(
toprow,
pce,
labels = c('', 'C'),
ncol = 1,
rel_heights = c(1, 1.3)
)
panel_plotFig 5.15: Cluster 2 (A) percentage cells of various samples for this cluster, (B) expression of the marker genes across clusters, (C) cell-enrichment results
Cluser 3
pce <- runpce(cluster.marker.names.3, 3, color_list[3])
#> [1] 244
count <- mybarplot(Cluster_3, 3)
violin <- grouped_violinPlots(cluster.marker.names.3, 3)
toprow <-
plot_grid(count, violin, labels = c('A', 'B'), align = 'h')
panel_plot <-
plot_grid(
toprow,
pce,
labels = c('', 'C'),
ncol = 1,
rel_heights = c(1, 1.3)
)
panel_plotFig 5.16: Cluster 3 (A) percentage cells of various samples for this cluster, (B) expression of the marker genes across clusters, (C) cell-enrichment results
Cluser 4
pce <- runpce(cluster.marker.names.4, 4, color_list[4])
#> [1] 210
count <- mybarplot(Cluster_4, 4)
violin <- grouped_violinPlots(cluster.marker.names.4, 4)
toprow <-
plot_grid(count, violin, labels = c('A', 'B'), align = 'h')
panel_plot <-
plot_grid(
toprow,
pce,
labels = c('', 'C'),
ncol = 1,
rel_heights = c(1, 1.3)
)
panel_plotFig 5.17: Cluster 4 (A) percentage cells of various samples for this cluster, (B) expression of the marker genes across clusters, (C) cell-enrichment results
Cluser 5
pce <- runpce(cluster.marker.names.5, 5, color_list[5])
#> [1] 32
count <- mybarplot(Cluster_5, 5)
violin <- grouped_violinPlots(cluster.marker.names.5, 5)
toprow <-
plot_grid(count, violin, labels = c('A', 'B'), align = 'h')
panel_plot <-
plot_grid(
toprow,
pce,
labels = c('', 'C'),
ncol = 1,
rel_heights = c(1, 1.3)
)
panel_plotFig 5.18: Cluster 5 (A) percentage cells of various samples for this cluster, (B) expression of the marker genes across clusters, (C) cell-enrichment results
Cluser 6
pce <- runpce(cluster.marker.names.6, 6, color_list[6])
#> [1] 355
count <- mybarplot(Cluster_6, 6)
violin <- grouped_violinPlots(cluster.marker.names.6, 6)
toprow <-
plot_grid(count, violin, labels = c('A', 'B'), align = 'h')
panel_plot <-
plot_grid(
toprow,
pce,
labels = c('', 'C'),
ncol = 1,
rel_heights = c(1, 1.3)
)
panel_plotFig 5.19: Cluster 6 (A) percentage cells of various samples for this cluster, (B) expression of the marker genes across clusters, (C) cell-enrichment results
Cluser 7
pce <- runpce(cluster.marker.names.7, 7, color_list[7])
#> [1] 38
count <- mybarplot(Cluster_7, 7)
violin <- grouped_violinPlots(cluster.marker.names.7, 7)
toprow <-
plot_grid(count, violin, labels = c('A', 'B'), align = 'h')
panel_plot <-
plot_grid(
toprow,
pce,
labels = c('', 'C'),
ncol = 1,
rel_heights = c(1, 1.3)
)
panel_plotFig 5.20: Cluster 7 (A) percentage cells of various samples for this cluster, (B) expression of the marker genes across clusters, (C) cell-enrichment results
Cluser 8
pce <- runpce(cluster.marker.names.1, 8, color_list[8])
#> [1] 68
count <- mybarplot(Cluster_8, 8)
violin <- grouped_violinPlots(cluster.marker.names.8, 8)
toprow <-
plot_grid(count, violin, labels = c('A', 'B'), align = 'h')
panel_plot <-
plot_grid(
toprow,
pce,
labels = c('', 'C'),
ncol = 1,
rel_heights = c(1, 1.3)
)
panel_plotFig 5.21: Cluster 8 (A) percentage cells of various samples for this cluster, (B) expression of the marker genes across clusters, (C) cell-enrichment results
Cluser 9
pce <- runpce(cluster.marker.names.1, 9, color_list[9])
#> [1] 68
count <- mybarplot(Cluster_9, 9)
violin <- grouped_violinPlots(cluster.marker.names.9, 9)
toprow <-
plot_grid(count, violin, labels = c('A', 'B'), align = 'h')
panel_plot <-
plot_grid(
toprow,
pce,
labels = c('', 'C'),
ncol = 1,
rel_heights = c(1, 1.3)
)
panel_plotFig 5.22: Cluster 9 (A) percentage cells of various samples for this cluster, (B) expression of the marker genes across clusters, (C) cell-enrichment results
Cluser 10
pce <- runpce(cluster.marker.names.1, 10, color_list[10])
#> [1] 68
count <- mybarplot(Cluster_10, 10)
violin <- grouped_violinPlots(cluster.marker.names.10, 10)
toprow <-
plot_grid(count, violin, labels = c('A', 'B'), align = 'h')
panel_plot <-
plot_grid(
toprow,
pce,
labels = c('', 'C'),
ncol = 1,
rel_heights = c(1, 1.3)
)
panel_plotFig 5.23: Cluster 10 (A) percentage cells of various samples for this cluster, (B) expression of the marker genes across clusters, (C) cell-enrichment results
Cluser 11
pce <- runpce(cluster.marker.names.1, 11, color_list[11])
#> [1] 68
count <- mybarplot(Cluster_11, 11)
violin <- grouped_violinPlots(cluster.marker.names.11, 11)
toprow <-
plot_grid(count, violin, labels = c('A', 'B'), align = 'h')
panel_plot <-
plot_grid(
toprow,
pce,
labels = c('', 'C'),
ncol = 1,
rel_heights = c(1, 1.3)
)
panel_plotFig 5.24: Cluster 11 (A) percentage cells of various samples for this cluster, (B) expression of the marker genes across clusters, (C) cell-enrichment results
Cluser 12
pce <- runpce(cluster.marker.names.1, 12, color_list[12])
#> [1] 68
count <- mybarplot(Cluster_12, 12)
violin <- grouped_violinPlots(cluster.marker.names.12, 12)
toprow <-
plot_grid(count, violin, labels = c('A', 'B'), align = 'h')
panel_plot <-
plot_grid(
toprow,
pce,
labels = c('', 'C'),
ncol = 1,
rel_heights = c(1, 1.3)
)
panel_plotFig 5.25: Cluster 12 (A) percentage cells of various samples for this cluster, (B) expression of the marker genes across clusters, (C) cell-enrichment results
Cluser 13
pce <- runpce(cluster.marker.names.1, 13, color_list[13])
#> [1] 68
count <- mybarplot(Cluster_13, 13)
violin <- grouped_violinPlots(cluster.marker.names.13, 13)
toprow <-
plot_grid(count, violin, labels = c('A', 'B'), align = 'h')
panel_plot <-
plot_grid(
toprow,
pce,
labels = c('', 'C'),
ncol = 1,
rel_heights = c(1, 1.3)
)
panel_plotFig 5.26: Cluster 13 (A) percentage cells of various samples for this cluster, (B) expression of the marker genes across clusters, (C) cell-enrichment results
Comparing STB genes
STB was enriched in cluster 6 and Cluster 8. We will see the expression of the genes in Zohou et. al., dataset.
PCE function to extract STB genes
getSTB <- function(inputgenelist) {
inputGenes <- toupper(inputgenelist)
humanGene <-
humanGeneMapping[humanGeneMapping$Gene.name %in% inputGenes,]
inputGenes <- humanGene$Gene
expressionData <-
data[intersect(row.names(data), humanGeneMapping$Gene),]
se <-
SummarizedExperiment(
assays = SimpleList(as.matrix(expressionData)),
rowData = row.names(expressionData),
colData = colnames(expressionData)
)
cellSpecificGenesExp <-
teGeneRetrieval(se, expressedGeneThreshold = 1)
print(length(inputGenes))
gs <- GeneSet(geneIds = toupper(inputGenes))
output2 <- teEnrichmentCustom(gs, cellSpecificGenesExp)
enrichmentOutput <-
setNames(data.frame(assay(output2[[1]]), row.names = rowData(output2[[1]])[, 1]),
colData(output2[[1]])[, 1])
row.names(cellDetails) <- cellDetails$RName
enrichmentOutput$Tissue <-
cellDetails[row.names(enrichmentOutput), "CellName"]
sct.genes <- as.data.frame(assay(output2[[2]][["SCT"]]))[1]
sct.genes <-
humanGeneMapping[humanGeneMapping$Gene %in% as.list(sct.genes$Gene), ]
sct.genes <- sct.genes$Gene.name
return(sct.genes)
}Prepare the dataset.
STB.clus.6.all <- getSTB(cluster.marker.names.6)
#> [1] 355
STB.clus.8.all <- getSTB(cluster.marker.names.8)
#> [1] 284
STB.clus.6.n.8 <- unique(c(STB.clus.8.all, STB.clus.6.all))
STB.clus.6.exl <- setdiff(STB.clus.6.all, STB.clus.8.all)
STB.clus.8.exl <- setdiff(STB.clus.8.all, STB.clus.6.all)
link <-
"ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE109nnn/GSE109555/suppl/GSE109555_All_Embryo_TPM.txt.gz"
download.file(link, "GSE109555_All_Embryo_TPM.txt.gz")
gunzip("GSE109555_All_Embryo_TPM.txt.gz")File needs editing to add a column header
sed -i 's/^D8_IVC2_E2_B1_1/genes\tD8_IVC2_E2_B1_1/g' GSE109555_All_Embryo_TPM.txtRead data
cts <-
as.matrix(read.csv(
"GSE109555_All_Embryo_TPM.txt",
sep = "\t",
row.names = "genes"
))
df <- as.data.frame(colnames(cts))
colnames(df) <- "cells"
df$days <- NA
df$days[which(str_detect(df$cells, "D14"))] <- "D14"
df$days[which(str_detect(df$cells, "D12"))] <- "D12"
df$days[which(str_detect(df$cells, "D10"))] <- "D10"
df$days[which(str_detect(df$cells, "D8"))] <- "D8"
df$days[which(str_detect(df$cells, "D6"))] <- "D6"
df$days <- as.factor(df$days)
df <- df %>% remove_rownames %>% column_to_rownames(var = "cells")
STB.clus.6.all.cts <- cts[rownames(cts) %in% STB.clus.6.all,]
STB.clus.8.all.cts <- cts[rownames(cts) %in% STB.clus.8.all,]
STB.clus.6.n.8.cts <- cts[rownames(cts) %in% STB.clus.6.n.8,]
STB.clus.6.exl.cts <- cts[rownames(cts) %in% STB.clus.6.exl,]
STB.clus.8.exl.cts <- cts[rownames(cts) %in% STB.clus.8.exl,]
heat_colors <- brewer.pal(9, "YlOrRd")Heatmap function
mydatahm <- function(mycts, name) {
mycts.t <- as.data.frame(t(mycts))
mycts.t$days <- NA
mycts.t$days[which(str_detect(rownames(mycts.t), "D14"))] <- "D14"
mycts.t$days[which(str_detect(rownames(mycts.t), "D12"))] <- "D12"
mycts.t$days[which(str_detect(rownames(mycts.t), "D10"))] <- "D10"
mycts.t$days[which(str_detect(rownames(mycts.t), "D8"))] <- "D8"
mycts.t$days[which(str_detect(rownames(mycts.t), "D6"))] <- "D6"
mycts.mean <- aggregate(. ~ days, mycts.t, mean)
ordered <-
as.data.frame(t(
mycts.mean %>% remove_rownames %>% column_to_rownames(var = "days")
))
ordered <- ordered[c("D6", "D8", "D10", "D12", "D14")]
hmap <- as.matrix(ordered)
pheatmap(
hmap,
color = heat_colors,
cluster_rows = F,
cluster_cols = F,
show_rownames = T,
border_color = NA,
fontsize = 12,
scale = "row",
fontsize_row = 10
)
}Cluster 6 only STB Heatmap
mydatahm(STB.clus.6.exl.cts, "Cluster 6 only STB genes")Fig 5.27: Heatmap of STB genes present only in cluster 6
Cluster 8 only STB Heatmap
mydatahm(STB.clus.8.exl.cts, "Cluster 8 only STB genes")Fig 5.28: Heatmap of STB genes present only in cluster 8
Session Information
sessionInfo()
#> R version 4.0.5 (2021-03-31)
#> Platform: x86_64-w64-mingw32/x64 (64-bit)
#> Running under: Windows 10 x64 (build 19042)
#>
#> Matrix products: default
#>
#> locale:
#> [1] LC_COLLATE=English_United States.1252
#> [2] LC_CTYPE=English_United States.1252
#> [3] LC_MONETARY=English_United States.1252
#> [4] LC_NUMERIC=C
#> [5] LC_TIME=English_United States.1252
#>
#> attached base packages:
#> [1] grid stats4 parallel stats graphics grDevices utils
#> [8] datasets methods base
#>
#> other attached packages:
#> [1] R.utils_2.10.1 R.oo_1.24.0
#> [3] R.methodsS3_1.8.1 plotly_4.9.3
#> [5] scales_1.1.1 pheatmap_1.0.12
#> [7] RColorBrewer_1.1-2 ComplexHeatmap_2.6.2
#> [9] dittoSeq_1.2.6 ggrepel_0.9.1
#> [11] calibrate_1.7.7 MASS_7.3-54
#> [13] enhancedDimPlot_0.0.0.9100 forcats_0.5.1
#> [15] purrr_0.3.4 readr_1.4.0
#> [17] tidyr_1.1.3 tibble_3.1.1
#> [19] tidyverse_1.3.1 gprofiler2_0.2.0
#> [21] TissueEnrich_1.10.1 GSEABase_1.52.1
#> [23] graph_1.68.0 annotate_1.68.0
#> [25] XML_3.99-0.6 AnnotationDbi_1.52.0
#> [27] SummarizedExperiment_1.20.0 GenomicRanges_1.42.0
#> [29] GenomeInfoDb_1.26.7 IRanges_2.24.1
#> [31] S4Vectors_0.28.1 MatrixGenerics_1.2.1
#> [33] matrixStats_0.58.0 ensurer_1.1
#> [35] stringr_1.4.0 dplyr_1.0.7
#> [37] gridExtra_2.3 multtest_2.46.0
#> [39] Biobase_2.50.0 BiocGenerics_0.36.1
#> [41] metap_1.4 patchwork_1.1.1
#> [43] cowplot_1.1.1 ggplot2_3.3.5
#> [45] kableExtra_1.3.4 knitr_1.33
#> [47] SeuratWrappers_0.3.0 SeuratObject_4.0.1
#> [49] Seurat_4.0.2
#>
#> loaded via a namespace (and not attached):
#> [1] scattermore_0.7 bit64_4.0.5
#> [3] irlba_2.3.3 multcomp_1.4-17
#> [5] DelayedArray_0.16.3 data.table_1.14.0
#> [7] rpart_4.1-15 RCurl_1.98-1.3
#> [9] generics_0.1.0 TH.data_1.0-10
#> [11] RSQLite_2.2.7 RANN_2.6.1
#> [13] future_1.21.0 bit_4.0.4
#> [15] mutoss_0.1-12 spatstat.data_2.1-0
#> [17] webshot_0.5.2 xml2_1.3.2
#> [19] lubridate_1.7.10 httpuv_1.6.1
#> [21] assertthat_0.2.1 xfun_0.22
#> [23] hms_1.1.0 jquerylib_0.1.4
#> [25] evaluate_0.14 promises_1.2.0.1
#> [27] fansi_0.4.2 dbplyr_2.1.1
#> [29] readxl_1.3.1 igraph_1.2.6
#> [31] DBI_1.1.1 tmvnsim_1.0-2
#> [33] htmlwidgets_1.5.3 spatstat.geom_2.1-0
#> [35] paletteer_1.3.0 ellipsis_0.3.2
#> [37] crosstalk_1.1.1 RSpectra_0.16-0
#> [39] backports_1.2.1 bookdown_0.22
#> [41] deldir_0.2-10 vctrs_0.3.8
#> [43] SingleCellExperiment_1.12.0 remotes_2.3.0
#> [45] Cairo_1.5-12.2 ROCR_1.0-11
#> [47] abind_1.4-5 cachem_1.0.5
#> [49] withr_2.4.2 sctransform_0.3.2
#> [51] goftest_1.2-2 mnormt_2.0.2
#> [53] svglite_2.0.0 cluster_2.1.2
#> [55] lazyeval_0.2.2 crayon_1.4.1
#> [57] labeling_0.4.2 edgeR_3.32.1
#> [59] pkgconfig_2.0.3 nlme_3.1-152
#> [61] rlang_0.4.11 globals_0.14.0
#> [63] lifecycle_1.0.0 miniUI_0.1.1.1
#> [65] sandwich_3.0-1 mathjaxr_1.4-0
#> [67] modelr_0.1.8 rsvd_1.0.5
#> [69] cellranger_1.1.0 polyclip_1.10-0
#> [71] lmtest_0.9-38 Matrix_1.3-3
#> [73] zoo_1.8-9 reprex_2.0.0
#> [75] ggridges_0.5.3
#> [ reached getOption("max.print") -- omitted 84 entries ]